/* 

Paper: Gentrification and pioneer businesses 
Authors: Behrens, Boulam, Martin, Mayneris 

Name dofile: create_db_table4.do  
Version: 16 nov. 2021 

Output: predict_gentri_database.dta

Inputs:

- duns_1990_2010_idconc_naics.dta  (from NETS data)
- naics5.dta (authors computation, publicly available)
- naics4.dta (authors computation, publicly available)
- pioneers_final_negbin.dta (from identify_pioneers_negbin.do)
- growth_est_9000.dta (authors computation, publicly available data)
- id_conc_database.dta (from create_db_concordedblocks.do)
- crime_rate.dta (GIS, publicly available)
- id_conc_disc_250m.dta (GIS, publicly available)
- id_conc_disc_500m.dta (GIS, publicly available)
- id_conc_neighbors_mapping.dta (GIS, publicly available)
- id_conc_transport.dta (GIS, publicly available)
- distance_idconc_water.dta (GIS, publicly available)
- landmark_area_distance_id_conc.dta (GIS, publicly available)
- landmarks_id_conc_count.dta (GIS, publicly available)
- id_conc_pluto.dta (authors computations, GIS, publicly available)
- id_conc_pop_income.dta (from prepare_variables4reg.do)
- blocks_couldgentrify.dta (from identify_gentri_blocks.do)
- blocks_gentrified.dta (from identify_gentri_blocks.do)
*/ 


********************************************
*Creation of plants counts at the ring-level
********************************************

use duns_1990_2010_idconc_naics.dta, clear

drop if id_conc==.|year==.

tostring naics, replace

merge m:1 naics using naics5.dta
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

merge m:1 naics using naics4.dta
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

gen toto=length(naics)
tab toto
keep if toto==6
drop toto

*Merge with information on pioneer sectors
merge m:1 naics using pioneers_final_negbin.dta 
rename pioneer_high_negbin pioneer
replace pioneer=0 if _merge==1
drop _merge 

*We merge with the sectoral growth rates for the Bartik instruments 
*(note that some sectors, mainly in the construction sector, are in 
*the NY Nets but not in the sect growth rates datasets)

destring naics, replace
merge m:1 naics using growth_est_9000.dta
drop if _merge==2
drop _merge

gen plants=1 
gen plants_instr=(1+g_total_9000) if year==1990 

bys id_conc year: egen plants_pion=sum(plants) if pioneer==1
bys id_conc year: egen var=max(plants_pion) 
replace plants_pion=var if pioneer==0
replace plants_pion=0 if pioneer==0&var==.
drop var

bys id_conc year: egen plants_npion=sum(plants) if pioneer==0
bys id_conc year: egen var=max(plants_npion) 
replace plants_npion=var if pioneer==1
replace plants_npion=0 if pioneer==1&var==.
drop var

bys id_conc year: egen plants_pion_instr=sum(plants_instr) if pioneer==1
bys id_conc year: egen var=max(plants_pion_instr) 
replace plants_pion_instr=var if pioneer==0
replace plants_pion_instr=0 if pioneer==0&var==.
drop var

keep id_conc year plants_pion plants_npion plants_pion_instr 

duplicates drop id_conc year, force

egen t=group(year)

fillin id_conc t

bys t: egen var=max(year)
replace year=var if year==.
drop var 

foreach v of var plants_*{
replace `v'=0 if `v'==.
}

*For the instrument in 2000, it is the value in 1990 that we must take in reality (stock 1990*growth rate)
tsset id_conc t
replace plants_pion_instr=l.plants_pion_instr if t==2
replace plants_pion_instr=. if t==1|t==3


drop _fillin

saveold pioneer_estab.dta, replace v(13)

**********
*Merge all
**********

use duns_1990_2010_idconc_naics.dta, clear

drop if id_conc==.|year==.

tostring naics, replace

merge m:1 naics using naics5.dta
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

merge m:1 naics using naics4.dta
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

gen toto=length(naics)
tab toto
keep if toto==6
drop toto

gen plants=1

collapse (sum) emp plants, by(id_conc year)

gen t=1 if year==1990
replace t=2 if year==2000
replace t=3 if year==2010

merge 1:1 id_conc t using id_conc_database.dta 
replace plants=0 if _m==2
replace emp=0 if _m==2

keep id_conc t year emp plants

merge 1:1 id_conc t using pioneer_estab.dta
drop if _m==2 


foreach v of var plants_pion plants_npion{
replace `v'=0 if _m==1
}

replace plants_pion_instr=0 if _m==1&t==2

drop _m 

gen plants_tot_pion=plants_npion+plants_pion

pwcorr plants plants_tot*
*Correlation equal to 1

drop plants_tot_pion

bys t: egen var=max(year)
replace year=var if year==.
drop var

saveold plants_id_conc.dta, v(13) replace

erase pioneer_estab.dta
 


***********************************************
******************Crime data*******************
******We compute simple averages for rings*****
*************due to 0 pop id_conc**************
************************************************


use crime_rate.dta, clear
keep year id_conc burglary rape murder robbery violent property
saveold crime.dta, replace v(13)

use crime.dta, clear

rename id_conc id_conc_disc250m

joinby id_conc_disc250m using id_conc_disc_250m.dta


collapse (mean) burglary250=burglary rape250=rape murder250=murder robbery250=robbery violent250=violent property250=property, by(id_conc year) fast

saveold crime250, replace v(13)


use crime.dta, clear

rename id_conc id_conc_disc500m

joinby id_conc_disc500m using id_conc_disc_500m.dta

collapse (mean) burglary500=burglary rape500=rape murder500=murder robbery500=robbery violent500=violent property500=property, by(id_conc year) fast

saveold crime500, replace v(13)


use crime.dta, clear

rename id_conc id_conc_neighbors

joinby id_conc_neighbors using id_conc_neighbors_mapping.dta

collapse (mean) burglarycont=burglary rapecont=rape murdercont=murder robberycont=robbery violentcont=violent propertycont=property, by(id_conc year) fast


merge 1:1 id_conc year using crime250
drop _m

merge 1:1 id_conc year using crime500
drop _m


saveold crime.dta, replace v(13)

erase crime250.dta
erase crime500.dta

**************************************************
*Minimum distance to a 1990-2000 gentrifying block
**************************************************

use id_conc_database, clear
keep if dis_ws<=30
keep id_conc intptlat intptlon
duplicates drop id_conc, force
saveold temp, replace v(13)

use blocks_gentrified.dta, clear 
keep if gentri_9000==1
duplicates drop id_conc, force 

keep id_conc 
merge 1:1 id_conc using temp

keep if _m==3

drop _m

rename intptlat intptlat_gentri
rename intptlon intptlon_gentri
rename id_conc id_conc_gentri
gen var=1
 
saveold coord_gentri_9000, replace v(13)

use temp, clear
gen var=1
joinby var using coord_gentri_9000

gen var2 = acos(sin((intptlat*_pi)/180) * sin((intptlat_gentri*_pi)/180) + cos(abs((intptlon*_pi)/180- (intptlon_gentri*_pi)/180)) * cos(intptlat*_pi/180) * cos(intptlat_gentri*_pi/180)) * 6378.137

collapse (min) dis_gentri9000=var2, by(id_conc intptlat intptlon) fast
keep id_conc dis_gentri9000 intptlat intptlon
label var dis_gentri9000 "Min. distance to gentrifying block 9000"
saveold dis_gentri9000, replace v(13)

erase temp.dta

********************************************
*# pioneers (and their IV) at the ring-level
********************************************

use plants_id_conc.dta, clear

rename id_conc id_conc_disc250m

joinby id_conc_disc250m using id_conc_disc_250m.dta

collapse (sum) plants_pion plants_pion_instr plants_npion, by(id_conc t) fast

foreach v of var plants*{
rename `v' `v'250
}

saveold plants_id_conc250.dta, replace v(13)

use plants_id_conc.dta, clear

rename id_conc id_conc_disc500m

joinby id_conc_disc500m using id_conc_disc_500m.dta

collapse (sum) plants_pion plants_pion_instr plants_npion, by(id_conc t) fast

foreach v of var plants*{
rename `v' `v'500
}

saveold plants_id_conc500.dta,replace v(13)

use plants_id_conc.dta, clear

rename id_conc id_conc_neighbors

joinby id_conc_neighbors using id_conc_neighbors_mapping.dta

collapse (sum) plants_pion plants_pion_instr plants_npion, by(id_conc t) fast

foreach v of var plants*{
rename `v' `v'con
}

saveold plants_id_conccont.dta, replace v(13)

******************************************
*# accessible bus lines and train stations
******************************************

use id_conc_transport.dta, clear

foreach v in "num_train_lines" "num_bus_lines"{
rename `v'_id_conc `v'
}

keep id_conc num_train_lines num_bus_lines

saveold transport, replace v(13)

use transport, clear
rename id_conc id_conc_disc250m

joinby id_conc_disc250m using id_conc_disc_250m.dta

collapse (sum) num_train_lines250=num_train_lines num_bus_lines250=num_bus_lines, by(id_conc) fast

saveold transport250, replace v(13)

use transport, clear
rename id_conc id_conc_disc500m

joinby id_conc_disc500m using id_conc_disc_500m.dta

collapse (sum) num_train_lines500=num_train_lines num_bus_lines500=num_bus_lines, by(id_conc) fast

saveold transport500, replace v(13)

use transport, clear
rename id_conc id_conc_neighbors

joinby id_conc_neighbors using id_conc_neighbors_mapping.dta

collapse (sum) num_train_linescont=num_train_lines num_bus_linescont=num_bus_lines, by(id_conc) fast

merge 1:1 id_conc using transport250

foreach v in num_train_lines num_bus_lines{
replace `v'250=0 if _m==1
replace `v'cont=0 if _m==2
}
drop _m

merge 1:1 id_conc using transport500

foreach v in num_train_lines num_bus_lines{
replace `v'500=0 if _m==1
replace `v'250=0 if _m==2
replace `v'cont=0 if _m==2
}
drop _m

saveold transport, replace v(13)

erase transport250.dta
erase transport500.dta

***********************
*Distance to waterfront
***********************

use distance_idconc_water, clear
keep id_conc distance 
duplicates drop id_conc, force 
saveold water, replace v(13)

******************
*Distance to parks
******************

use landmark_area_distance_id_conc, clear
keep id_conc distance_major_parks
duplicates drop id_conc, force 
saveold dist_parks, replace v(13)

************
*# landmarks
************

use landmarks_id_conc_count, clear
rename id_conc id_conc_disc250m

joinby id_conc_disc250m using id_conc_disc_250m.dta

collapse (sum) pointlm_count250=pointlm_count, by(id_conc) fast

saveold lm_count250, replace v(13)

use landmarks_id_conc_count, clear
rename id_conc id_conc_disc500m

joinby id_conc_disc500m using id_conc_disc_500m.dta

collapse (sum) pointlm_count500=pointlm_count, by(id_conc) fast

saveold lm_count500, replace v(13)

use landmarks_id_conc_count, clear
rename id_conc id_conc_neighbors

joinby id_conc_neighbors using id_conc_neighbors_mapping.dta

collapse (sum) pointlm_countcont=pointlm_count, by(id_conc) fast

merge 1:1 id_conc using lm_count250
replace pointlm_count250=0 if _m==1
replace pointlm_countcont=0 if _m==2
drop _m

merge 1:1 id_conc using lm_count500
replace pointlm_count500=0 if _m==1
replace pointlm_countcont=0 if _m==2
replace pointlm_count250=0 if _m==2
drop _m


saveold lm_count, replace v(13)

erase lm_count250.dta
erase lm_count500.dta

*****************************************************************************
*Rent control buildings, vacant land, historical and limited height districts
*****************************************************************************

use id_conc_pluto.dta, clear

keep id_conc area_id_conc count_rc_A_id_conc_NW ltdheight_area_id_conc1 histdist_area_id_conc1 landuse_area_id_conc11

rename count_rc_A_id_conc_NW count_rc_A_NW
rename ltdheight_area_id_conc1 ltdheight_area
rename histdist_area_id_conc1 histdist_area
rename landuse_area_id_conc11 vacant_area

saveold rent_controls, replace v(13)

use rent_controls, clear

rename id_conc id_conc_disc250m

joinby id_conc_disc250m using id_conc_disc_250m.dta

collapse (sum) area_id_conc250=area_id_conc count_rc_A_NW250=count_rc_A_NW ltdheight_area250=ltdheight_area histdist_area250=histdist_area vacant_area250=vacant_area, by(id_conc) fast

gen sh_ltdheight250=ltdheight_area250/area_id_conc250
gen sh_histdist250=histdist_area250/area_id_conc250
gen sh_vacant250=vacant_area250/area_id_conc250

keep id_conc count_rc_A_NW250 sh_*
 
saveold rent_controls250, replace v(13)

use rent_controls, clear

rename id_conc id_conc_disc500m

joinby id_conc_disc500m using id_conc_disc_500m.dta

collapse (sum) area_id_conc500=area_id_conc count_rc_A_NW500=count_rc_A_NW ltdheight_area500=ltdheight_area histdist_area500=histdist_area vacant_area500=vacant_area, by(id_conc) fast

gen sh_ltdheight500=ltdheight_area500/area_id_conc500
gen sh_histdist500=histdist_area500/area_id_conc500
gen sh_vacant500=vacant_area500/area_id_conc500

keep id_conc count_rc_A_NW500 sh_*

saveold rent_controls500, replace v(13)

use rent_controls, clear

rename id_conc id_conc_neighbors

joinby id_conc_neighbors using id_conc_neighbors_mapping.dta

collapse (sum) area_id_conccont=area_id_conc count_rc_A_NWcont=count_rc_A_NW ltdheight_areacont=ltdheight_area histdist_areacont=histdist_area vacant_areacont=vacant_area, by(id_conc) fast

gen sh_ltdheightcont=ltdheight_areacont/area_id_conccont
gen sh_histdistcont=histdist_areacont/area_id_conccont
gen sh_vacantcont=vacant_areacont/area_id_conccont

keep id_conc count_rc_A_NWcont sh_*

merge 1:1 id_conc using rent_controls250

foreach v in "count_rc_A_NW" "sh_ltdheight" "sh_histdist" "sh_vacant"{
replace `v'250=0 if _m==1
replace `v'cont=0 if _m==2
}
drop _m

merge 1:1 id_conc using rent_controls500

foreach v in "count_rc_A_NW" "sh_ltdheight" "sh_histdist" "sh_vacant"{
replace `v'500=0 if _m==1
replace `v'250=0 if _m==2
replace `v'cont=0 if _m==2
}
drop _m

saveold rent_controls, replace v(13)

erase rent_controls250.dta
erase rent_controls500.dta

************************
*Database for prediction
************************

use id_conc_pop_income, clear 

drop emp* lemp* nbplants* sh_edu*

merge 1:1 id_conc year using crime.dta
drop if _m==2
drop _m

merge m:1 id_conc using dis_gentri9000
drop if _m==2
drop _m

merge 1:1 id_conc t using plants_id_conc250.dta
drop if _m==2
drop _m

merge 1:1 id_conc t using plants_id_conc500.dta
drop if _m==2
drop _m

merge 1:1 id_conc t using plants_id_conccont.dta
drop if _m==2
drop _m

merge 1:1 id_conc t using plants_id_conc.dta, keepusing(plants_pion plants_npion)
drop if _m==2
drop _m

merge m:1 id_conc using water
drop if _m==2
drop _m

merge m:1 id_conc using transport
drop if _m==2
foreach v of var num_train_linescont-num_bus_lines500{
replace `v'=0 if _m==1
}
drop _m

merge m:1 id_conc using dist_parks
drop if _m==2
drop _m

merge m:1 id_conc using lm_count
drop if _m==2
drop _m

merge m:1 id_conc using rent_controls
drop if _m==2
drop _m

*Creation of adequate dummies and log variables

foreach n in "250" "500" "cont"{
gen histdist`n'= (sh_histdist`n'!=0&sh_histdist`n'!=.)
gen ltdheight`n'= (sh_ltdheight`n'!=0&sh_ltdheight`n'!=.)
}

g waterfront=distance_water<0.2

gen ldis_gentri9000=log(dis_gentri9000*1000+1)
gen ldis_parks=log(distance_major_parks*1000+1)

foreach v of var  pointlm_* count_rc* num_train* num_bus*{
gen l`v'=log(`v'+1)
}

*Creation of various census explanatory variable

foreach v of var black asian white other edu{
gen sh_`v'=`v'/pop
gen sh_`v'250=`v'250/pop250
gen sh_`v'500=`v'500/pop500
gen sh_`v'cont=`v'cont/popcont
}

foreach n in "" "250" "500" "cont"{
gen age_build`n'=1990-med_yearb`n' if t==1 
replace age_build`n'=2000-med_yearb`n' if t==2
replace age_build`n'=2010-med_yearb`n' if t==3
}

*Creation of proxies for gentrification used as dependent variables

tsset id_conc t

gen dsh_edu=f.sh_edu-sh_edu
gen dlpc_inc=f.lpc_income-lpc_income

*Creation of pioneer explanatory variables and instruments
foreach n in "250" "500" "con"{
gen lplants_npion`n'=log(1+plants_npion`n')
}

foreach v of var plants_pion250 lplants_npion250 plants_pion500 lplants_npion500 plants_pioncon lplants_npioncon{
gen d`v'=`v'-l.`v'
}

foreach n in "250" "500" "con"{
gen dplants_pion_instr`n'=plants_pion_instr`n'-l.plants_pion`n'
}

*Creation of lagged proxies for gentrification
tsset id_conc t
foreach n in "250" "500" "con"{
gen l_dlpc_inc`n'=lpc_income`n'-l.lpc_income`n'
gen l_dsh_edu`n'=sh_edu`n'-l.sh_edu`n'
label var l_dlpc_inc`n' "Lag $\Delta$ Ln per cap. income"
label var l_dsh_edu`n' "Lag $\Delta$ Share college edu."
}

gen l_gentri_0010250=ldis_gentri9000
gen l_gentri_0010500=ldis_gentri9000
gen l_gentri_0010con=ldis_gentri9000

label var l_gentri_0010250 "Ln dist. to closest gentrifying block 1990-2000"
label var l_gentri_0010500 "Ln dist. to closest gentrifying block 1990-2000"
label var l_gentri_0010con "Ln dist. to closest gentrifying block 1990-2000"

*Labelling of all the variables
label var waterfront "Less than 200m from waterfront"
label var ldis_park "Distance to closest park (log)"
label var lcrent "Ln rent"

label var lpc_income250 "Ln per cap. income"
label var lpop250 "Ln pop."
label var sh_edu250 "Share college edu. resid."
label var sh_black250 "Share black resid."
label var sh_asian250 "Share asian resid."
label var sh_white250 "Share white resid."
label var sh_other250 "Share other resid."
label var dplants_pion250 "$\Delta$ \# pioneer plants"
label var dlplants_npion250 "$\Delta$ Ln (1+ \# non pioneer plants)"
label var lnum_train_lines250 "Ln (1+\# train lines)"
label var lnum_bus_lines250 "Ln (1+\# bus lines)"
label var lcount_rc_A_NW250 "Ln (1+\# rent control buildings)"
label var age_build250 "Median age of buildings"
label var lpointlm_count250 "Ln \# of main landmarks"
label var ltdheight250 "Presence of limited height dtsricts"
label var histdist250 "Presence of historical districts"
label var sh_vacant250 "Share vacant land"
label var murder250 "\# murder per cap."
label var robbery250 "\# robbery per cap."
label var burglary250 "\# burglary per cap."
label var rape250 "\# rape per cap."

label var lpc_income500 "Ln per cap. income"
label var lpop500 "Ln pop."
label var sh_edu500 "Share college edu. resid."
label var sh_black500 "Share black resid."
label var sh_asian500 "Share asian resid."
label var sh_white500 "Share white resid."
label var sh_other500 "Share other resid."
label var dplants_pion500 "$\Delta$ \# pioneer plants"
label var dlplants_npion500 "$\Delta$ Ln (1+ \# non pioneer plants)"
label var lnum_train_lines500 "Ln (1+\# train lines)"
label var lnum_bus_lines500 "Ln (1+\# bus lines)"
label var lcount_rc_A_NW500 "Ln (1+\# rent control buildings)"
label var age_build500 "Median age of buildings"
label var lpointlm_count500 "Ln \# of main landmarks"
label var ltdheight500 "Presence of limited height dtsricts"
label var histdist500 "Presence of historical districts"
label var sh_vacant500 "Share vacant land"
label var murder500 "\# murder per cap."
label var robbery500 "\# robbery per cap."
label var burglary500 "\# burglary per cap."
label var rape500 "\# rape per cap."

label var lpc_incomecon "Ln per cap. income"
label var lpopcon "Ln pop."
label var sh_educon "Share college edu. resid."
label var sh_blackcon "Share black resid."
label var sh_asiancon "Share asian resid."
label var sh_whitecon "Share white resid."
label var sh_othercon "Share other resid."
label var dplants_pioncon "$\Delta$ \# pioneer plants"
label var dlplants_npioncon "$\Delta$ Ln (1+ \# non pioneer plants)"
label var lnum_train_linescon "Ln (1+\# train lines)"
label var lnum_bus_linescon "Ln (1+\# bus lines)"
label var lcount_rc_A_NWcon "Ln (1+\# rent control buildings)"
label var age_buildcon "Median age of buildings"
label var lpointlm_countcon "Ln \# of main landmarks"
label var ltdheightcon "Presence of limited height dtsricts"
label var histdistcon "Presence of historical districts"
label var sh_vacantcon "Share vacant land"
label var murdercont "\# murder per cap."
label var robberycont "\# robbery per cap."
label var burglarycont "\# burglary per cap."
label var rapecont "\# rape per cap."


merge m:1 id_conc using blocks_couldgentrify.dta, keepusing(poor_9000 poor_0010)

keep if (poor_9000==1 |   poor_0010==1) &_m==3
drop _m

merge m:1 id_conc using blocks_gentrified
drop if _m==2
replace gentri_9000=0 if _m==1 
replace gentri_0010=0 if _m==1
drop _m

saveold predict_gentri_database, replace v(13)
